import pandas as pd
# import plotly.graph_objects as go
import plotly.express as px
import statistics as stats
import os
import re
In diesem Notebook geht es um die Anwendung der Verfahren aus dem ersten Notebook auf eine ganze Reihe von Texten. Die berechneten Maßzahlen sollen außerdem visualisiert werden, um Vergleiche zu erleichtern.
Ich verwende hier Texte von 10 verschiedenen Autoren, die ich mit Scrapy von AO3 gesammelt habe. Pro Autor sind bis zu 20 Texte vorhanden. Die annotierten Texte befinden sich im Ordner tagged-stanza in der Datei ao3_authorship.7z. Damit der Code direkt ausführbar ist, speichert ihr den Inhalt des Archivs am besten in einem Ordner namens ao3_authorship.
Die Texte lassen sich – in der unannotierten Version (im Ordner raw) – auch gut für Experimente zur Urheberschaftsbestimmung mit Stylo verwenden. (Achtung: Wir betreiben hier nur distant reading. Richtiges Lesen der Texte auf eigene Gefahr!)
def ttr(n_types: int, n_tokens: int) -> float:
    '''Type-token ratio: number of distinct word forms (types) divided by
    the total number of running words (tokens).'''
    return n_types / n_tokens
def sttr(tokens, window_size=500):
    '''Standardised type-token ratio (STTR).

    Divide the text into consecutive, non-overlapping windows of
    ``window_size`` tokens, calculate the TTR for each full window, and
    return the arithmetic mean of those values.  A trailing remainder
    shorter than one window is ignored, as is conventional for STTR.

    Parameters
    ----------
    tokens : list or pandas.Series of token strings
    window_size : int, tokens per window (default 500)

    Raises
    ------
    ValueError
        If the text contains fewer than ``window_size`` tokens.  (The
        previous version crashed with an opaque StatisticsError here.)
    '''
    n_windows = len(tokens) // window_size
    if n_windows == 0:
        raise ValueError(
            'text has fewer tokens than window_size=%d; '
            'cannot compute STTR' % window_size)
    results = []
    for start in range(0, n_windows * window_size, window_size):
        window = tokens[start:start + window_size]
        # TTR of one window; every window has exactly window_size tokens.
        results.append(len(set(window)) / window_size)
    return stats.mean(results)
def mtld(tokens, factor_size=.72):
    '''Measure of Textual Lexical Diversity (MTLD) according to
    McCarthy & Jarvis (2010),
    https://link.springer.com/article/10.3758/BRM.42.2.381

    The token stream is scanned once forward and once backward.  Each pass
    counts how many "factors" fit into the text: a factor is a stretch of
    tokens whose running TTR has dropped to ``factor_size``, plus a
    proportional partial factor for the leftover stretch at the end.
    MTLD is the token count divided by the factor count, averaged over
    both directions.

    Parameters
    ----------
    tokens : list or pandas.Series of token strings (must be non-empty;
        a text whose tokens are all unique raises ZeroDivisionError,
        since no factor ever completes)
    factor_size : float, TTR threshold that closes a factor (default .72)
    '''
    # Copy into a plain list once: this both handles pandas.Series input
    # and guarantees the caller's object is never modified (the previous
    # version reversed the input list in place as a side effect).
    tokens = list(tokens)

    def mtldsub(token_seq):
        '''One directional MTLD pass over token_seq.'''
        factors = 0       # completed factors
        types = set()     # distinct tokens in the current stretch
        token_count = 0   # tokens in the current stretch
        for token in token_seq:
            types.add(token)
            token_count += 1
            # Once the running TTR reaches/falls below the threshold,
            # close the factor and start a fresh stretch:
            if (len(types) / token_count) <= factor_size:
                factors += 1
                types = set()
                token_count = 0
        # Proportional partial factor for a leftover stretch at the end:
        if token_count > 0:
            ttr_rest = len(types) / token_count
            factors += (1 - ttr_rest) / (1 - factor_size)
        # Total token count divided by the factor count:
        return len(token_seq) / factors

    # Run in both directions; the mean of both passes is the final result:
    return stats.mean([mtldsub(tokens), mtldsub(tokens[::-1])])
def pos_freq(upos):
    '''Absolute and relative frequency of each UPOS tag.

    Expects a pandas.Series of UPOS tags; returns a DataFrame with the
    columns 'upos', 'count' (absolute frequency) and 'rel' (relative
    frequency), ordered by descending count.'''
    freq_table = upos.value_counts().to_frame().reset_index()
    freq_table.columns = ['upos', 'count']
    # Relative frequency = absolute count over the total number of tags:
    freq_table['rel'] = freq_table['count'] / len(upos)
    return freq_table
def lex_density(upos_freq):
    '''Lexical density: share of content-word tokens (VERB, NOUN, PROPN,
    ADJ) among all tokens, computed from a UPOS frequency table as
    produced by pos_freq().'''
    content_tags = ['VERB', 'NOUN', 'PROPN', 'ADJ']
    is_content = upos_freq.upos.isin(content_tags)
    return upos_freq.loc[is_content, 'count'].sum() / upos_freq['count'].sum()
def avg_sentence_length(ids, stat='median'):
    '''Average sentence length from a pandas.Series of CoNLL token ids.

    In CoNLL format the token id restarts at 1 for every new sentence, so
    each occurrence of the id 1 marks a sentence boundary.

    Parameters
    ----------
    ids : pandas.Series of integer token ids (id == 1 starts a sentence)
    stat : 'median' (default) or 'mean' — which average to return

    Raises
    ------
    statistics.StatisticsError if the input contains no tokens.

    Note: the previous version assumed the Series index started at 0 and
    crashed with UnboundLocalError on filtered/re-indexed input or on
    input beginning mid-sentence; this version only tracks a running
    token counter and has no index assumptions.
    '''
    sentence_lengths = []
    current = 0  # tokens seen in the sentence currently being read
    for value in ids:
        if value == 1:
            # New sentence starts; store the finished one (if any):
            if current > 0:
                sentence_lengths.append(current)
            current = 1
        else:
            current += 1
    # The last sentence has no following boundary, append it explicitly:
    if current > 0:
        sentence_lengths.append(current)
    if stat == 'mean':
        return stats.mean(sentence_lengths)
    else:
        return stats.median(sentence_lengths)
# Base directory with the unpacked corpus files:
basedir = r'ao3_authorship/' # or wherever else the unpacked files are located
# Metadata table (one row per text, keyed by 'id'):
meta = pd.read_csv(basedir + r'meta.csv')
# Column names of the CoNLL-U format used in the tagged .tsv files:
colnames = ['id', 'token', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']
# Compute all measures for every tagged text and write them to a TSV file:
with open(basedir + 'measures.tsv', 'w', encoding="UTF-8") as f:
    # Header:
    print('id', 'STTR_0250', 'STTR_0500', 'STTR_0750', 'STTR_1000', 'MTLD', 'Lexical_density', 'Avg_sentence_length', sep='\t', file=f)
    with os.scandir(basedir + 'tagged-stanza/') as it:
        for entry in it:
            if entry.name.endswith(".tsv") and entry.is_file():
                # The numeric part of the file name is the text id:
                textid = int(re.search(r'([0-9]+)(.tsv)', entry.name).group(1))
                # quoting=3 (QUOTE_NONE) and keep_default_na=False keep token
                # strings verbatim (quote characters, tokens like "null", ...):
                text = pd.read_table(entry.path, names=colnames, quoting=3, keep_default_na=False)
                # author = meta.loc[meta['id'] == textid, 'author'].values[0]
                # Lexical measures use word tokens only (punctuation dropped);
                # tokens are lowercased and disambiguated by their UPOS tag:
                only_words = text.query('upos != "PUNCT"')
                only_words = only_words.assign(token_upos=only_words['token'].str.lower() + "/" + only_words['upos'])
                # STTR at several window sizes, plus MTLD:
                sttr_0250 = sttr(only_words['token_upos'], window_size=250)
                sttr_0500 = sttr(only_words['token_upos'])
                sttr_0750 = sttr(only_words['token_upos'], window_size=750)
                sttr_1000 = sttr(only_words['token_upos'], window_size=1000)
                mtld_ = mtld(only_words['token_upos'])
                # Lexical density and sentence length use the full token table:
                pos_freq_ = pos_freq(text['upos'])
                lex_density_ = lex_density(pos_freq_)
                avg_sntnc_length = avg_sentence_length(text['id'])
                # One tab-separated result row per text:
                print(textid, sttr_0250, sttr_0500, sttr_0750, sttr_1000, mtld_, lex_density_, avg_sntnc_length, sep='\t', file=f)
# Read the results back in as a DataFrame:
measures = pd.read_table(basedir + 'measures.tsv', quoting=3)
measures
| id | STTR_0250 | STTR_0500 | STTR_0750 | STTR_1000 | MTLD | Lexical_density | Avg_sentence_length | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1006420 | 0.609671 | 0.512356 | 0.458091 | 0.421514 | 103.358394 | 0.297362 | 12.0 |
| 1 | 1016975 | 0.699000 | 0.629000 | 0.592000 | 0.558000 | 201.752275 | 0.438558 | 15.0 |
| 2 | 10290932 | 0.597481 | 0.512308 | 0.464889 | 0.436167 | 87.544242 | 0.358505 | 12.0 |
| 3 | 1042187 | 0.608923 | 0.520000 | 0.468333 | 0.432667 | 99.937333 | 0.376483 | 25.0 |
| 4 | 1042258 | 0.609667 | 0.521667 | 0.485000 | 0.448333 | 94.848898 | 0.391342 | 29.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 187 | 9616364 | 0.623500 | 0.542333 | 0.492333 | 0.458833 | 106.110815 | 0.357025 | 14.0 |
| 188 | 961984 | 0.627145 | 0.531662 | 0.477953 | 0.442563 | 119.888950 | 0.361261 | 18.0 |
| 189 | 964610 | 0.627034 | 0.543571 | 0.497481 | 0.462429 | 111.811362 | 0.375217 | 13.0 |
| 190 | 9768917 | 0.618560 | 0.536833 | 0.491833 | 0.459833 | 103.610285 | 0.368611 | 13.0 |
| 191 | 984652 | 0.605000 | 0.509000 | 0.436000 | 0.413000 | 102.868232 | 0.350539 | 14.0 |
192 rows × 8 columns
Kombination mit den Metadaten:
# Combine the measures with the metadata (inner join on the text id):
measures = meta.merge(measures, on='id')
measures
| id | url | author | title | rating | archive_warnings | categories | fandoms | relationships | characters | ... | hits | summary | notes | STTR_0250 | STTR_0500 | STTR_0750 | STTR_1000 | MTLD | Lexical_density | Avg_sentence_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1006420 | https://archiveofourown.org/works/1006420?view... | Ablissa | A Different Sort of Adventure | Teen And Up Audiences | No Archive Warnings Apply | F/M | Doctor Who,Doctor Who (2005),Doctor Who & Rela... | Tenth Doctor/Rose Tyler,Ninth Doctor/Rose Tyler | Tenth Doctor,Rose Tyler,Ninth Doctor | ... | 6124 | <p>"She was a breath of fresh air in his life,... | <p>\n (See the end of the chapter for... | 0.609671 | 0.512356 | 0.458091 | 0.421514 | 103.358394 | 0.297362 | 12.0 |
| 1 | 1074849 | https://archiveofourown.org/works/1074849?view... | Ablissa | Is She Happy? | General Audiences | No Archive Warnings Apply | F/M | Doctor Who,Doctor Who (2005),Doctor Who & Rela... | Tenth Doctor/Rose Tyler,Eleventh Doctor/Rose T... | Tenth Doctor,Eleventh Doctor,Rose Tyler,Rose T... | ... | 4470 | <p>On his darkest day, the Doctor meets a fami... | NaN | 0.577778 | 0.497000 | 0.432889 | 0.407500 | 73.042222 | 0.305998 | 8.0 |
| 2 | 5510432 | https://archiveofourown.org/works/5510432?view... | Ablissa | First Impressions (Perhaps I Was Wrong) | Mature | No Archive Warnings Apply | M/M | Phandom/The Fantastic Foursome (YouTube RPF) | Dan Howell/Phil Lester,Dan Howell & Phil Leste... | Dan Howell,Phil Lester,Chris Kendall,PJ Liguor... | ... | 84173 | <p>Phil Lester goes back to university for his... | <p>\n (See the end of the chapter for... | 0.630932 | 0.536614 | 0.484333 | 0.447561 | 122.527927 | 0.338135 | 14.0 |
| 3 | 5747662 | https://archiveofourown.org/works/5747662?view... | Ablissa | Secrets We Didn't Need To Keep | Teen And Up Audiences | No Archive Warnings Apply | M/M | Phandom/The Fantastic Foursome (YouTube RPF) | Dan Howell/Phil Lester,Dan Howell & Phil Lester | Dan Howell,Phil Lester,Louise Pentland Watson | ... | 22186 | <p>Dan Howell. Twenty-four. In love with his b... | <p>Hi! This is just a Phan one-shot I wrote a ... | 0.625556 | 0.532000 | 0.484444 | 0.447778 | 121.752209 | 0.340645 | 13.0 |
| 4 | 6366910 | https://archiveofourown.org/works/6366910?view... | Ablissa | It's Just A Formality | Teen And Up Audiences | No Archive Warnings Apply | M/M | Phandom/The Fantastic Foursome (YouTube RPF) | Dan Howell/Phil Lester,Dan Howell & Phil Lester | Dan Howell,Phil Lester | ... | 10023 | <p>"Well, I guess we've never really told you,... | NaN | 0.633538 | 0.540333 | 0.487333 | 0.454333 | 119.235779 | 0.329418 | 12.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 187 | 1080285 | https://archiveofourown.org/works/1080285?view... | pherede | If At First | Explicit | Creator Chose Not To Use Archive Warnings | M/M | The Hobbit RPF | Richard Armitage/Lee Pace | Richard Armitage,Lee Pace | ... | 4998 | <p>The breakup is painful. The phone calls are... | NaN | 0.582154 | 0.499333 | 0.451667 | 0.417000 | 78.944224 | 0.345091 | 11.0 |
| 188 | 1080354 | https://archiveofourown.org/works/1080354?view... | pherede | A Challenge | Explicit | Creator Chose Not To Use Archive Warnings | F/M,M/M | The Hobbit - All Media Types,The Hobbit - J. R... | Legolas Greenleaf/Tauriel,Fíli/Kíli | Kíli,Fíli,Legolas Greenleaf,Legolas,Tauriel | ... | 10993 | <p>Tauriel watches Fili and Kili in their pris... | <p>For <a href="http://dirtydwarfdick.tumblr.c... | 0.627000 | 0.527000 | 0.466667 | 0.439000 | 127.157230 | 0.368657 | 25.0 |
| 189 | 1115620 | https://archiveofourown.org/works/1115620?view... | pherede | The Last Alone | Explicit | Creator Chose Not To Use Archive Warnings | M/M | The Hobbit - All Media Types,The Hobbit - J. R... | Thorin Oakenshield/Thranduil | Thorin Oakenshield,Thranduil | ... | 14412 | <p>Thorin is victorious, and the terms of Mirk... | NaN | 0.627304 | 0.546909 | 0.505333 | 0.469200 | 107.587834 | 0.398634 | 29.0 |
| 190 | 1362487 | https://archiveofourown.org/works/1362487?view... | pherede | Coming Of Age | Explicit | Creator Chose Not To Use Archive Warnings | M/M | The Hobbit - All Media Types,The Lord of the R... | Oropher/Thranduil,Galion/Thranduil,Oropher/Thr... | Oropher,Thranduil,Galion | ... | 8492 | <p>Thranduil is coming of age, and as the cust... | <p>Written for a prompt on the kink meme.</p>\... | 0.597636 | 0.516636 | 0.470952 | 0.439091 | 90.612645 | 0.392402 | 24.0 |
| 191 | 1610285 | https://archiveofourown.org/works/1610285?view... | pherede | Lain Low | Explicit | Graphic Depictions Of Violence | M/M | The Hobbit - All Media Types,The Hobbit - J. R... | Thorin Oakenshield/Thranduil,Bilbo Baggins/Sma... | Thorin Oakenshield,Thranduil,Bilbo Baggins,Sau... | ... | 20868 | <p>"Mirkwood has fallen, and the Necromancer (... | <p>\n (See the end of the chapter for... | 0.614208 | 0.533851 | 0.489692 | 0.459515 | 101.767795 | 0.384886 | 20.0 |
192 rows × 30 columns
Einige der gesammelten Texte stammen nicht allein von den Autoren, die uns hier interessieren. Vorerst wollen wir also Kollaborationen ausfiltern:
# Keep only single-author texts: drop rows whose 'author' field is one of
# the comma-joined collaboration pseudo-authors.
measures = measures[~measures.author.isin(['Alma_Anor,merripestin', 'Argus_Persa,Bibibabubi,emma_screams,Kidhuzural,Ulan', 'emma_screams,Ulan'])]
Wie hängen die Maße miteinander zusammen? Rein visuell können wir das anhand von Streudiagrammen untersuchen. Plotly Express stellt netterweise eine Funktion bereit, mit der wir gleich alle Kombinationen von Variablen auf einmal darstellen können.
# Scatterplot matrix of all pairwise combinations of the measures:
fig = px.scatter_matrix(measures,
                        dimensions=["STTR_0250", "STTR_0500", "STTR_0750", "STTR_1000", "MTLD", "Lexical_density", "Avg_sentence_length"],
                        opacity=.4, # transparency
                        #color="author", # distinguish authors by colour?
                        labels={col:col.replace('_', ' ') for col in measures.columns}) # prettify labels
fig.update_traces(diagonal_visible=False)
fig.update_layout(
    title="Zusammenhänge zwischen den Maßen",
    template="ggplot2",
    height=1000,
    width=1000
)
fig.show()
Um die Werte für einzelne Autoren zu vergleichen, können wir z.B. Boxplots verwenden.
Boxplots kombinieren zentrale Tendenz, Streuungsmaß und Ausreißer in einer Darstellung:
# Boxplot: distribution of median sentence lengths per author.
fig = px.box(measures, x="author", y="Avg_sentence_length")
fig.update_layout(
    title="Autorenvergleich: Satzlängen",
    xaxis_title="Autor",
    yaxis_title="Median der Satzlängen",
    template="ggplot2"
)
# Adjust the appearance (thin black lines, white boxes, translucent markers):
fig.update_traces(line=dict(width=1,
                            color="black",
                            ),
                  fillcolor="white",
                  marker=dict(size=5,
                              color="rgba(0, 0, 0, 0.5)",
                              line=dict(width=1,
                                        color="black")
                              )
                  )
fig.show()
# Boxplot: distribution of lexical density per author.
fig = px.box(measures, x="author", y="Lexical_density")
fig.update_layout(
    title="Autorenvergleich: lexikalische Dichte",
    xaxis_title="Autor",
    yaxis_title="Lexikalische Dichte",
    template="ggplot2"
)
# Adjust the appearance (thin black lines, white boxes, translucent markers):
fig.update_traces(line=dict(width=1,
                            color="black",
                            ),
                  fillcolor="white",
                  marker=dict(size=5,
                              color="rgba(0, 0, 0, 0.5)",
                              line=dict(width=1,
                                        color="black")
                              )
                  )
fig.show()
# Boxplot: distribution of STTR (window size 1000) per author.
fig = px.box(measures, x="author", y="STTR_1000")
fig.update_layout(
    title="Autorenvergleich: STTR",
    xaxis_title="Autor",
    yaxis_title="STTR (Fenstergröße 1000)",
    template="ggplot2",
    height=600
)
# Adjust the appearance (thin black lines, white boxes, translucent markers):
fig.update_traces(line=dict(width=1,
                            color="black",
                            ),
                  fillcolor="white",
                  marker=dict(size=5,
                              color="rgba(0, 0, 0, 0.5)",
                              line=dict(width=1,
                                        color="black")
                              )
                  )
fig.show()